The purpose of this file is processing the combined data files for Summer 2022 into files that contain only valid data for analysis, excluding invalid sessions and participants
Data is imported from 2 files, indicating two levels of analysis: participants and blocks (item-level).
Note: mouse-cursor data contained in final_mouse_blocks.json file is not handled here.
# IMPORT DATA ----
df_participants <- fromJSON("input/su22_sgc4d_final_participants.json")
df_items <- fromJSON("input/su22_sgc4d_final_items.json")

# add term indicator
df_participants$term <- "summer22"
df_items$term <- "summer22"

# DEFINE SGC_4D validity criteria ----
sessions <- c("suPROLIFIC") # SGC4D running on prolific
# Condition codes are kept as strings: the participant `condition` column is
# cast with as.character() before it is matched against this vector, so
# string codes avoid relying on implicit numeric-to-character coercion.
conditions <- c("11111112", "11311112") # 2 conditions
violation_threshold <- 4 # number of allowable browser violations
effort_exclusion <- c(
  "I didn't try very hard, or rushed through the questions",
  "I started out trying hard, but gave up at some point"
)
n_items <- 15 # fifteen items is complete dataset per participant

# placeholder for excluding participants
ex_participants <- data.frame()
Note: We drop all scores calculated in the stimulus engine (except absolute score, which is simply the number of strictly correct responses), as they are recalculated during analysis using a different MC scoring algorithm.
# Create factors in PARTICIPANTS, derive helper columns, then keep a fixed,
# ordered set of columns.
df_participants <- df_participants %>%
  mutate(
    # identifiers stay character; condition is cast before it is recoded below
    subject = as.character(subject),
    condition = as.character(condition),
    sona_id = as.character(sona_id),
    # human-readable condition label (factor levels follow the order given)
    pretty_condition = recode_factor(
      condition,
      "11111112" = "ORTH-equilateral",
      "11311112" = "TRI-equilateral"
    ),
    # categorical columns converted to factors in place
    across(
      c(
        study, session, exp_id, pool, mode, attn_check,
        status, term, major, browser, os
      ),
      factor
    ),
    gender = as.factor(gender),
    age = as.integer(age),
    country = gsub('"', "", country), # remove extraneous ""
    # total time rescaled by 1000 * 60 (assumes totaltime is in milliseconds,
    # giving minutes -- TODO confirm)
    totaltime_m = totaltime / 1000 / 60
    # NOTE: the former `year` and `native_language` factor copies were never
    # selected below, so they are no longer computed; `schoolyear` and
    # `language` are kept in their raw form.
  ) %>%
  select( # order cols
    subject, study, condition, pretty_condition,
    session, exp_id, sona_id, pool, mode, attn_check,
    # explanation,
    effort, difficulty, confidence, enjoyment, other,
    age, country, language, schoolyear, major, gender, disability,
    browser, width, height, os,
    starttime, status, term, violations,
    absolute_score,
    # discriminant_score, tri_score, orth_score, other_score, blank_score,
    totaltime_m
  )
# NOTE THAT WE DROP ALL SCORES, WHICH ARE INCORRECTLY CALCULATED IN THE
# stimulus engine. We do not drop the raw responses (answers).
df_items <- df_items %>%
  mutate(
    # human-readable condition label (factor levels follow the order given)
    pretty_condition = recode_factor(
      condition,
      "11111112" = "ORTH-equilateral",
      "11311112" = "TRI-equilateral"
    ),
    # categorical columns converted to factors in place; conversions for
    # subject, condition, explicit, impasse, grid, mark and ixn are disabled
    across(c(pool, mode, term, relation, block, correct, q), factor),
    # rescaled timings (assumes rt and time_elapsed are in milliseconds,
    # giving seconds and minutes respectively -- TODO confirm)
    rt_s = rt / 1000,
    time_elapsed_m = time_elapsed / 1000 / 60
  ) %>%
  select(
    subject, study, term, pool, mode,
    condition, pretty_condition, block,
    explicit, impasse, grid, mark, ixn,
    gwidth, gheight, graph,
    time_elapsed_m, question, relation, q, correct,
    # discriminant, tri_score, orth_score, other_score, blank_score,
    answer, rt_s
  ) # WE DROP ALL SCORES BC THEY ARE RESCORED IN ANALYSIS FILE
Starting with Winter 2022, data are saved to the database even if the subject’s browser did not meet minimum specifications (at which point they are prompted to change browsers, or end the study). This allows us to learn about the browsers, screen sizes and OS that (potential) subjects are using. However, these data are not exported from the database for analysis (see flatten.js and status.js scripts). Thus, only subjects who successfully completed the entire study are included in this file.
# MANUALLY INSPECT status: participant count per completion status
dplyr::summarize(
  group_by(df_participants, status),
  n = n()
)
## # A tibble: 1 × 2
## status n
## <fct> <int>
## 1 success 122
122 participants successfully completed the study.
# DISCARD participants whose completion status is not "success"
# `%in%` never yields NA, so a participant with a missing status is flagged
# as well (a plain `!=` comparison would silently keep such rows).
ex_participants <- rbind(
  ex_participants,
  df_participants %>%
    filter(!status %in% "success") %>%
    mutate(reason = "invalid-status")
)

# keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))
No data need to be excluded on account of completion status.
Participants are randomly assigned to an experimental condition when starting the study. Here we validate that only conditions for the current study are included in this dataset.
# MANUALLY INSPECT conditions: participant count per condition code
dplyr::summarize(
  group_by(df_participants, condition),
  n = n()
)
## # A tibble: 2 × 2
## condition n
## <chr> <int>
## 1 11111112 60
## 2 11311112 62
Data from conditions not corresponding to valid conditions should be discarded.
# DISCARD participants from conditions invalid for this study
# Flagged rows are tagged with the exclusion reason and appended to the
# running table of excluded participants.
ex_participants <- rbind(
  ex_participants,
  df_participants %>%
    filter(!(condition %in% conditions)) %>%
    mutate(reason = "invalid-condition")
)

# keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))
No data need to be excluded on account of condition.
The (string) session code is embedded in the URL
querystring by the experimenter to differentiate testing sessions in
SONA from demo and other environment setup tasks.
# MANUALLY INSPECT sessions: participant count per session code
dplyr::summarize(
  group_by(df_participants, session),
  n = n()
)
## # A tibble: 1 × 2
## session n
## <fct> <int>
## 1 suPROLIFIC 122
Data from sessions not corresponding to valid sessions should be discarded.
# DISCARD participants from invalid sessions
# Flagged rows are tagged with the exclusion reason and appended to the
# running table of excluded participants.
ex_participants <- rbind(
  ex_participants,
  df_participants %>%
    filter(!(session %in% sessions)) %>%
    mutate(reason = "invalid-session")
)

# keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))
No participants are excluded on account of session (ie. app testing or pilot session).
Browser interaction data is recorded by jspsych allowing us to determine if subjects violate our instructions not to leave the browser tab (or exit fullscreen mode) during test. These incidents are recorded in jspsych interaction data object, and the number of violations is counted and added to the participant data file.
Due to eccentricity of the browser events captured, 1-2 browser violations can be captured even if the subject did not leave the browser window (eg. in case of resizing window to meet minimum requirements.)
# MANUALLY INSPECT violations: participant count per violation value
dplyr::summarize(
  group_by(df_participants, violations),
  n = n()
)
## # A tibble: 11 × 2
## violations n
## <dbl> <int>
## 1 1 73
## 2 1.5 2
## 3 2 21
## 4 2.5 2
## 5 3 11
## 6 3.5 4
## 7 4 5
## 8 4.5 1
## 9 5.5 1
## 10 6 1
## 11 7 1
# DISCARD participants exceeding the threshold of browser interaction violations
# Only values strictly above `violation_threshold` are flagged.
ex_participants <- rbind(
  ex_participants,
  df_participants %>%
    filter(violations > violation_threshold) %>%
    mutate(reason = "exceeded-violations")
)

# keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))
Four participants were excluded for exceeding the maximum allowed number of browser interaction violations.
To assist in mitigating increased noise in data collected asynchronously from the UCSD student subject pool, we added explicit ratings of how much effort the participant expended on the task. This question was implemented as a multiple-choice drop-down on an ‘Effort’ page prior to the ‘Demographics’ survey at the end of the study. Subjects were given four options : (1) I tried my best on each question, (2) I tried my best on most questions, (3) I started out trying hard, but gave up at some point, (4) I didn’t try very hard, or rushed through the questions.
# MANUALLY INSPECT effort: participant count per self-rated effort response
dplyr::summarize(
  group_by(df_participants, effort),
  n = n()
)
## # A tibble: 3 × 2
## effort n
## <chr> <int>
## 1 I started out trying hard, but gave up at some point 1
## 2 I tried my best on each question 110
## 3 I tried my best on most questions 7
Participants answering with options I didn’t try very hard, or rushed through the questions or I started out trying hard, but gave up at some point are excluded from analysis.
# DISCARD participants who indicated they did not expend adequate effort on the study
# A participant is flagged when the effort response is one of the strings
# listed in `effort_exclusion`.
ex_participants <- rbind(
  ex_participants,
  df_participants %>%
    filter(effort %in% effort_exclusion) %>%
    mutate(reason = "selfrated-effort")
)

# keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!(subject %in% ex_participants$subject))
One participant is excluded for low (self-rated) effort.
The 6th question in the study is non-discriminatory (can easily get correct answer regardless of strategy) and serves as an attention check question.
# MANUALLY INSPECT attention: participant count per attention-check result
dplyr::summarize(
  group_by(df_participants, attn_check),
  n = n()
)
## # A tibble: 2 × 2
## attn_check n
## <fct> <int>
## 1 FALSE 18
## 2 TRUE 99
Participants who answered the attention check question incorrectly should be excluded.
#DISCARD participants who failed the attention check question (DISABLED: the exclusion code below is commented out)
# exclude_attn <- df_participants %>%
# filter(attn_check == FALSE) %>%
# mutate(reason="failed-attnchk")
#
# ex_participants <- rbind(ex_participants, exclude_attn)
# rm(exclude_attn)
#
# df_participants <- df_participants %>%
# filter( ! subject %in% ex_participants$subject)
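The attention-check exclusion is currently disabled (the code above is commented out), so no participants are excluded for failing the attention check question, even though 18 participants answered it incorrectly.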
Next, we need to discard item_level data for excluded participants.
# Split item-level rows by whether their subject was excluded at the
# participant level: excluded rows go to ex_items, the rest stay in df_items.
excluded_subjects <- ex_participants$subject
ex_items <- filter(df_items, subject %in% excluded_subjects)
df_items <- filter(df_items, !(subject %in% excluded_subjects))
rm(excluded_subjects)
After all exclusions, we are left with the following number of participants per condition:
# MANUALLY INSPECT conditions: participants per condition after all exclusions
dplyr::summarize(
  group_by(df_participants, condition),
  n = n()
)
## # A tibble: 2 × 2
## condition n
## <chr> <int>
## 1 11111112 57
## 2 11311112 60
Finally, we need to validate we have a complete set of items for all valid participants.
# Validate completeness per participant rather than in aggregate: the set of
# subjects must match between the two tables AND every subject must have
# exactly n_items item rows. (Comparing only total row counts would let one
# subject's missing items be masked by another subject's surplus.)
items_per_subject <- table(df_items$subject)
setequal(names(items_per_subject), df_participants$subject) &&
  all(items_per_subject == n_items)
## [1] TRUE
# see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html
# ADD VARIABLE METADATA
# import data dictionary
dict <- rio::import("input/dictionary_sgc4d_participants.csv", "csv")
# add variable labels built from the VARIABLE / DESCRIPTION dictionary columns
var_label(df_participants) <- dict_to_list(select(dict, VARIABLE, DESCRIPTION))

# ADD DATASET METADATA
metadata(df_participants)$name <- "Experimental PARTICIPANTS for study SGC4D"
metadata(df_participants)$description <-
  "Data for study SGC4D summarized at PARTICIPANT level"
metadata(df_participants)$creator <- "Amy Rae Fox"
metadata(df_participants)$contact <- "amyraefox@gmail.com"

#{r, eval = checkMode() == "pdf"} #ONLY FOR PDF KNIT
codebook::skim_codebook(df_participants)
| Name | data |
| Number of rows | 117 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| factor | 13 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| subject | 0 | 1 | 5 | 5 | 0 | 117 | 0 |
| condition | 0 | 1 | 8 | 8 | 0 | 2 | 0 |
| sona_id | 0 | 1 | 24 | 24 | 0 | 112 | 0 |
| effort | 0 | 1 | 32 | 33 | 0 | 2 | 0 |
| other | 0 | 1 | 0 | 402 | 59 | 58 | 0 |
| country | 0 | 1 | 2 | 44 | 0 | 11 | 0 |
| language | 0 | 1 | 7 | 7 | 0 | 1 | 0 |
| schoolyear | 0 | 1 | 7 | 27 | 0 | 7 | 0 |
| disability | 0 | 1 | 0 | 72 | 46 | 25 | 0 |
| starttime | 0 | 1 | 24 | 24 | 0 | 117 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| study | 0 | 1 | FALSE | 1 | SGC: 117 |
| pretty_condition | 0 | 1 | FALSE | 2 | TRI: 60, ORT: 57 |
| session | 0 | 1 | FALSE | 1 | suP: 117 |
| exp_id | 0 | 1 | FALSE | 4 | 630: 40, 630: 37, 630: 20, 630: 20 |
| pool | 0 | 1 | FALSE | 1 | pro: 117 |
| mode | 0 | 1 | FALSE | 1 | asy: 117 |
| attn_check | 0 | 1 | FALSE | 2 | TRU: 99, FAL: 18 |
| major | 0 | 1 | FALSE | 7 | Mat: 32, Soc: 26, Fin: 20, Hum: 18 |
| gender | 0 | 1 | FALSE | 3 | Mal: 59, Fem: 51, Oth: 7 |
| browser | 0 | 1 | FALSE | 1 | chr: 117 |
| os | 0 | 1 | FALSE | 5 | Win: 82, Mac: 27, Chr: 4, Win: 2 |
| status | 0 | 1 | FALSE | 1 | suc: 117 |
| term | 0 | 1 | FALSE | 1 | sum: 117 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | min | median | max | hist |
|---|---|---|---|---|---|---|---|---|
| difficulty | 0 | 1 | 3.54 | 0.98 | 1.00 | 4.00 | 5.00 | ▁▂▅▇▃ |
| confidence | 0 | 1 | 3.38 | 1.06 | 1.00 | 3.00 | 5.00 | ▁▃▇▇▃ |
| enjoyment | 0 | 1 | 3.45 | 1.20 | 1.00 | 4.00 | 5.00 | ▂▆▇▇▇ |
| age | 0 | 1 | 34.63 | 11.83 | 19.00 | 32.00 | 71.00 | ▇▆▃▂▁ |
| width | 0 | 1 | 1642.15 | 303.85 | 1143.00 | 1536.00 | 2752.00 | ▇▇▆▁▁ |
| height | 0 | 1 | 843.73 | 145.20 | 685.00 | 785.00 | 1329.00 | ▇▃▃▁▁ |
| violations | 0 | 1 | 1.62 | 0.91 | 1.00 | 1.00 | 4.00 | ▇▂▁▁▁ |
| absolute_score | 0 | 1 | 1.79 | 3.46 | 0.00 | 0.00 | 12.00 | ▇▁▁▁▁ |
| totaltime_m | 0 | 1 | 12.47 | 6.45 | 2.15 | 11.31 | 38.43 | ▇▇▂▁▁ |
# ONLY FOR HTML KNIT
# Only the metadata table is requested; every other report section is
# switched off.
codebook(
  df_participants,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)
Dataset name: Experimental PARTICIPANTS for study SGC4D
Data for study SGC4D summarized at PARTICIPANT level
Date published: 2022-08-26
Creator:
| name | value |
|---|---|
| 1 | Amy Rae Fox |
|
|
# see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html
# ADD VARIABLE METADATA
# import data dictionary
dict <- rio::import("input/dictionary_sgc4d_items.csv", "csv")
# add variable labels built from the VARIABLE / DESCRIPTION dictionary columns
var_label(df_items) <- dict_to_list(select(dict, VARIABLE, DESCRIPTION))

# ADD DATASET METADATA
metadata(df_items)$name <- "Experimental ITEMS for study SGC4D"
metadata(df_items)$description <-
  "Data for study SGC4D summarized at participant-item level"
metadata(df_items)$creator <- "Amy Rae Fox"
metadata(df_items)$contact <- "amyraefox@gmail.com"

#{r, eval = checkMode() == "pdf"} #ONLY FOR PDF EXPORT
codebook::skim_codebook(df_items)
| Name | data |
| Number of rows | 1755 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| factor | 8 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| subject | 0 | 1 | 5 | 5 | 0 | 117 | 0 |
| study | 0 | 1 | 5 | 5 | 0 | 1 | 0 |
| condition | 0 | 1 | 8 | 8 | 0 | 2 | 0 |
| explicit | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| impasse | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| grid | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| mark | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| ixn | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| graph | 0 | 1 | 10 | 10 | 0 | 1 | 0 |
| question | 0 | 1 | 26 | 87 | 0 | 15 | 0 |
| answer | 0 | 1 | 0 | 25 | 42 | 94 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| term | 0 | 1 | FALSE | 1 | sum: 1755 |
| pool | 0 | 1 | FALSE | 1 | pro: 1755 |
| mode | 0 | 1 | FALSE | 1 | asy: 1755 |
| pretty_condition | 0 | 1 | FALSE | 2 | TRI: 900, ORT: 855 |
| block | 0 | 1 | FALSE | 3 | ite: 819, ite: 585, ite: 351 |
| relation | 0 | 1 | FALSE | 10 | end: 234, mee: 234, mid: 234, sta: 234 |
| q | 0 | 1 | FALSE | 15 | 1: 117, 2: 117, 3: 117, 4: 117 |
| correct | 0 | 1 | FALSE | 2 | FAL: 1340, TRU: 415 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | min | median | max | hist |
|---|---|---|---|---|---|---|---|---|
| gwidth | 0 | 1 | 600.00 | 0.00 | 600.00 | 600.00 | 600.00 | ▁▁▇▁▁ |
| gheight | 0 | 1 | 600.00 | 0.00 | 600.00 | 600.00 | 600.00 | ▁▁▇▁▁ |
| time_elapsed_m | 0 | 1 | 6.88 | 5.22 | 0.35 | 5.71 | 37.39 | ▇▃▁▁▁ |
| rt_s | 0 | 1 | 34.79 | 38.23 | 0.14 | 22.03 | 425.35 | ▇▁▁▁▁ |
# ONLY FOR HTML EXPORT
# Only the metadata table is requested; every other report section is
# switched off.
codebook(
  df_items,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)
Dataset name: Experimental ITEMS for study SGC4D
Data for study SGC4D summarized at participant-item level
Date published: 2022-08-26
Creator:
| name | value |
|---|---|
| 1 | Amy Rae Fox |
|
|
Exploration of the distribution of key response variables for validation purposes:
# Participant-level absolute score: overall, then faceted by condition
gf_histogram(~absolute_score, data = df_participants) +
  labs(title = "SGC4D Distribution of Absolute Score")

gf_dhistogram(~absolute_score, data = df_participants) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Absolute Score (by Condition)")

# Item-level correctness proportions: overall, then faceted by condition
gf_props(~correct, data = df_items) +
  labs(title = "SGC4D Distribution of Item Absolute Score")

gf_props(~correct, data = df_items) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Item Absolute Score (by Condition)")

# Participant-level total study time: overall, then faceted by condition.
# The faceted plot previously repeated the absolute-score histogram; it now
# shows total time so it matches the overall plot directly above it.
gf_histogram(~totaltime_m, data = df_participants) +
  labs(title = "SGC4D Distribution of Total Study Time")

gf_histogram(~totaltime_m, data = df_participants) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Total Study Time (by Condition)")

# Item-level response time
gf_histogram(~rt_s, data = df_items) +
  labs(title = "SGC4D Distribution of Item Response Time")

# Participant-level total study time against absolute score; the title now
# names the variables that are actually plotted.
gf_jitter(totaltime_m ~ absolute_score, data = df_participants) +
  labs(title = "SGC4D Total Study Time vs Absolute Score")
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Between-condition comparison of absolute score using the nonparametric
# option of ggbetweenstats
ggbetweenstats(
  data = df_participants,
  x = pretty_condition,
  y = absolute_score,
  type = "nonparametric"
)
For transparency, we save and identify the excluded data.
# Excluded data (participant level and item level)
write.csv(
  ex_participants,
  file = "output/excluded_participants_summer22_sgc4d.csv",
  row.names = FALSE
)
write.csv(
  ex_items,
  file = "output/excluded_items_summer22_sgc4d.csv",
  row.names = FALSE
)

# CSV files with the retained data
write.csv(
  df_participants,
  file = "output/sgc4d_participants.csv",
  row.names = FALSE
)
write.csv(
  df_items,
  file = "output/sgc4d_items.csv",
  row.names = FALSE
)

# export R DATA STRUCTURES (include codebook metadata)
rio::export(df_participants, "output/sgc4d_participants.rds") # to R data structure file
rio::export(df_items, "output/sgc4d_items.rds") # to R data structure file